Import Dataset

# Read the raw HR export from the working directory into a data frame.
data <- read.csv(file = "HR_Employee_Data.csv")

Import Libraries

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(dplyr)
library(plotly)
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.1
library(caret)
## Warning: package 'caret' was built under R version 4.1.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.1.3
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(plotly)
library(corrly)
library(ecodist)
## Warning: package 'ecodist' was built under R version 4.1.3
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.1.3
## naivebayes 0.9.7 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
## 
## Attaching package: 'psych'
## The following object is masked from 'package:ecodist':
## 
##     distance
## The following object is masked from 'package:kernlab':
## 
##     alpha
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
## Loading required package: rpart
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.3
library(ROCR) 
## Warning: package 'ROCR' was built under R version 4.1.3
library(class)
#install.packages("ggcorrplot")
#install.packages("ROCR")
#install.packages("rpart.plot")
#install.packages('ecodist')
#install.packages("remotes")
#remotes::install_github("kmaheshkulkarni/corrly")
#install.packages("hrbrthemes")
#install.packages("highcharter")
#install.packages("kernlab")
#install.packages("caTools")

Summary of the Data

head(data)
##   ï..Emp_Id satisfaction_level last_evaluation number_project
## 1  IND02438                38%             53%              2
## 2  IND28133                80%             86%              5
## 3  IND07164                11%             88%              7
## 4  IND30478                72%             87%              5
## 5  IND24003                37%             52%              2
## 6  IND08609                41%             50%              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
str(data)
## 'data.frame':    14999 obs. of  11 variables:
##  $ ï..Emp_Id            : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : chr  "38%" "80%" "11%" "72%" ...
##  $ last_evaluation      : chr  "53%" "86%" "88%" "87%" ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
#glimpse(data)
summary(data)
##   ï..Emp_Id         satisfaction_level last_evaluation    number_project 
##  Length:14999       Length:14999       Length:14999       Min.   :2.000  
##  Class :character   Class :character   Class :character   1st Qu.:3.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.803  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :7.000  
##  average_montly_hours time_spend_company Work_accident         left       
##  Min.   : 96.0        Min.   : 2.000     Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:156.0        1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :200.0        Median : 3.000     Median :0.0000   Median :0.0000  
##  Mean   :201.1        Mean   : 3.498     Mean   :0.1446   Mean   :0.2381  
##  3rd Qu.:245.0        3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :310.0        Max.   :10.000     Max.   :1.0000   Max.   :1.0000  
##  promotion_last_5years  Department           salary         
##  Min.   :0.00000       Length:14999       Length:14999      
##  1st Qu.:0.00000       Class :character   Class :character  
##  Median :0.00000       Mode  :character   Mode  :character  
##  Mean   :0.02127                                            
##  3rd Qu.:0.00000                                            
##  Max.   :1.00000

Check for Null Values

# Count the missing values in every column and show them as a one-column table.
cbind(lapply(data, function(column) sum(is.na(column))))
##                       [,1]
## ï..Emp_Id             0   
## satisfaction_level    0   
## last_evaluation       0   
## number_project        0   
## average_montly_hours  0   
## time_spend_company    0   
## Work_accident         0   
## left                  0   
## promotion_last_5years 0   
## Department            0   
## salary                0
sum(is.na(data))
## [1] 0

Data Cleaning

# Drop the percent sign and store the satisfaction score as an integer.
data$satisfaction_level <- as.integer(
  gsub("%", "", data$satisfaction_level, fixed = TRUE)
)
head(data)
##   ï..Emp_Id satisfaction_level last_evaluation number_project
## 1  IND02438                 38             53%              2
## 2  IND28133                 80             86%              5
## 3  IND07164                 11             88%              7
## 4  IND30478                 72             87%              5
## 5  IND24003                 37             52%              2
## 6  IND08609                 41             50%              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
# Drop the percent sign and store the evaluation score as an integer.
data$last_evaluation <- as.integer(
  gsub("%", "", data$last_evaluation, fixed = TRUE)
)
head(data)
##   ï..Emp_Id satisfaction_level last_evaluation number_project
## 1  IND02438                 38              53              2
## 2  IND28133                 80              86              5
## 3  IND07164                 11              88              7
## 4  IND30478                 72              87              5
## 5  IND24003                 37              52              2
## 6  IND08609                 41              50              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low

#Renaming the Column name in Dataframe

# The first column is read with a byte-order-mark prefix glued to its name
# (it shows up as "ï..Emp_Id" in the output above). How that prefix is spelled
# depends on the platform and encoding used to read the CSV, so the column is
# renamed by position instead of by the mangled name.
data <- data %>%
  rename(Emp_Id = 1)
head(data)
##     Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438                 38              53              2
## 2 IND28133                 80              86              5
## 3 IND07164                 11              88              7
## 4 IND30478                 72              87              5
## 5 IND24003                 37              52              2
## 6 IND08609                 41              50              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low

1)Correlation plot

# Pairwise correlations of columns 2 to 9, rounded to one decimal place and
# drawn as a labelled correlation plot.
corr <- data[2:9] %>%
  cor() %>%
  round(digits = 1)
ggcorrplot(corr, lab = TRUE)

2) People who have left in each department

# Cross-tabulate department against the `left` flag and plot the two resulting
# count columns (X0, X1) side by side for every department.
ans <- crosstab(data$Department, data$left)
Department <- rownames(ans)
fig <- plot_ly(
  ans,
  x = ~Department, y = ~X0,
  type = "bar", name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company") %>%
  layout(
    title = "Employees who have left based on department",
    yaxis = list(title = "Count"),
    barmode = "group"
  )
fig

3) People who have left based on salary

# Cross-tabulate salary band against the `left` flag and plot the two resulting
# count columns (X0, X1) side by side for every band.
ans <- crosstab(data$salary, data$left)
Salary <- rownames(ans)
fig <- plot_ly(
  ans,
  x = ~Salary, y = ~X0,
  type = "bar", name = "Working for Company"
) %>%
  add_trace(y = ~X1, name = "Left the Company") %>%
  layout(
    title = "Employees who have left based on salary",
    yaxis = list(title = "Count"),
    barmode = "group"
  )
fig

4) Area plot of time spent in the company for employees who have and have not left

ans=crosstab(data$time_spend_company,data$left)
ans
##      X0   X1
## 2  3191   53
## 3  4857 1586
## 4  1667  890
## 5   640  833
## 6   509  209
## 7   188    0
## 8   162    0
## 10  214    0
Time_Spent=rownames(ans)
Time_Spent
## [1] "2"  "3"  "4"  "5"  "6"  "7"  "8"  "10"
# Two filled line traces over the years-in-company categories: one for the X0
# counts and one for the X1 counts of the cross-table built above.
fig <- plot_ly(
  ans,
  x = ~Time_Spent, y = ~X0,
  type = "scatter", mode = "lines",
  name = "Working for Company", fill = "tozeroy"
) %>%
  add_trace(y = ~X1, name = "Left the Company", fill = "tozeroy") %>%
  layout(
    xaxis = list(title = "Time Worked"),
    yaxis = list(title = "Count")
  )
fig

5) Average monthly working hours according to department

df <- data
head(df)
##     Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438                 38              53              2
## 2 IND28133                 80              86              5
## 3 IND07164                 11              88              7
## 4 IND30478                 72              87              5
## 5 IND24003                 37              52              2
## 6 IND08609                 41              50              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
str(df)
## 'data.frame':    14999 obs. of  11 variables:
##  $ Emp_Id               : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : int  38 80 11 72 37 41 10 92 89 42 ...
##  $ last_evaluation      : int  53 86 88 87 52 50 77 85 100 53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
# Mean of average_montly_hours within each department.
by_department <- group_by(df, Department)
data1 <- summarize(by_department, Avg_hrs = mean(average_montly_hours))

# One bar per department showing that mean.
fig <- plot_ly(
  data1,
  x = ~Department, y = ~Avg_hrs,
  type = "bar", color = I("dark blue")
) %>%
  layout(
    title = "Average monthly working hours according to department",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Average monthly working hours")
  )
fig
#ggplot(df,aes(x=number_project,y=average_montly_hours))+geom_jitter(aes(color=Department))

6) Number of work accidents for each department

# Keep only the rows flagged with a work accident, count them per department
# and order the departments from fewest to most accidents.
accident_rows <- df %>%
  filter(Work_accident == 1)
data2 <- accident_rows %>%
  group_by(Department) %>%
  summarize(No_of_wa = n()) %>%
  arrange(No_of_wa)
head(data2)
## # A tibble: 6 x 2
##   Department  No_of_wa
##   <chr>          <int>
## 1 hr                89
## 2 accounting        96
## 3 management       103
## 4 product_mng      132
## 5 RandD            134
## 6 marketing        138
# Line chart of the per-department accident counts computed in `data2`.
# The axis title is passed as list(text = ...): Highcharts configures an axis
# title through an options object, so a bare string does not set the label.
hc <- data2 %>%
  hchart("line", hcaes(x = Department, y = No_of_wa)) %>%
  hc_title(text = "Number of work accidents for each department") %>%
  hc_yAxis(title = list(text = "Number of work accidents"))
hc

7) Density plot of satisfaction level according to salary

# Split the employees by salary band so that each band gets its own density
# curve of satisfaction_level.
l <- df %>% filter(salary == "low")
m <- df %>% filter(salary == "medium")
h <- df %>% filter(salary == "high")

# Overlay the three kernel density estimates as area series.
# The x-axis title is passed as list(text = ...): Highcharts configures an axis
# title through an options object, so a bare string does not set the label.
hc2 <- hchart(
  density(l$satisfaction_level),
  type = "area", color = "steelblue", name = "Low Salary"
) %>%
  hc_add_series(
    density(m$satisfaction_level),
    type = "area", color = "#B71C1C", name = "Medium Salary"
  ) %>%
  hc_add_series(
    density(h$satisfaction_level),
    type = "area", color = "yellow", name = "High Salary"
  ) %>%
  hc_title(text = "Density plot of satisfaction level according to salary") %>%
  hc_xAxis(title = list(text = "Satisfaction Level (0-100)"))
hc2

8) Time spent per Department

# Pie chart with one slice per department, sized by the time_spend_company
# values of its rows. The same axis settings are applied to both axes.
hidden_axis <- list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
fig <- plot_ly(
  df,
  labels = ~Department, values = ~time_spend_company,
  type = "pie"
) %>%
  layout(
    title = "Time spent per Department",
    xaxis = hidden_axis,
    yaxis = hidden_axis
  )
fig

9) Distribution of Satisfaction level

  # Bar chart counting the employees at each satisfaction_level value, with the
  # x-axis tick labels rotated by 90 degrees.
  p <- ggplot(data, aes(x = satisfaction_level)) +
    geom_bar(color = "darkblue", fill = "lightblue") +
    labs(title = "Distribution of Satisfaction level") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  p

10) Number of projects for each department

# Bar traces of number_project per department, coloured by salary band.
fig <- plot_ly(
  data,
  x = ~Department,
  y = ~number_project,
  color = ~salary,
  type = "bar"
)
fig

11) Box plot between satisfaction_level and average_montly_hours.

# Box plots of monthly hours against satisfaction level, one panel per salary
# band. satisfaction_level is continuous, so without an explicit grouping
# ggplot2 draws a single box per panel and warns
# "Continuous x aesthetic -- did you forget aes(group=...)?".
# Binning the x variable into 10-point intervals gives one box per interval.
data %>%
  ggplot(aes(x = satisfaction_level, y = average_montly_hours)) +
  geom_boxplot(
    aes(group = cut_width(satisfaction_level, width = 10)),
    fill = "lightblue"
  ) +
  xlab("satisfaction_level") +
  ylab("average_montly_hours") +
  facet_grid(~salary)

12) Salary distribution

# Pie chart with one slice per salary band, sized by the time_spend_company
# values of its rows and labelled with the band name and its percentage.
fig <- plot_ly(
  data,
  labels = ~salary,
  values = ~time_spend_company,
  type = "pie",
  textinfo = "label+percent"
)
fig

13) Promotion in last 5 years vs number of projects

# Bar traces of number_project against the promotion flag, coloured by
# department.
fig <- plot_ly(
  data,
  x = ~promotion_last_5years,
  y = ~number_project,
  type = "bar",
  color = ~Department
)
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

ML MODELS:

1)DT MODEL

# Reproducible random assignment of every row to partition 1 (train, drawn
# with probability 0.8) or partition 2 (test, probability 0.2). Column 1
# (Emp_Id) is left out of the modelling data.
set.seed(234)
dataDT <- data[2:11]
smpl <- sample(x = 2, size = nrow(dataDT), replace = TRUE, prob = c(0.8, 0.2))
is_train <- smpl == 1
train <- dataDT[is_train, ]
test <- dataDT[smpl == 2, ]

# Classification tree predicting `left` from all remaining columns.
fit <- rpart(left ~ ., data = train, method = "class")
rpart.plot(fit, extra = 106)

Prediction

# Predict the class of every test row and tabulate actual (rows) against
# predicted (columns).
predict_unseen <- predict(object = fit, newdata = test, type = "class")
table_mat <- table(test$left, predict_unseen)
table_mat
##    predict_unseen
##        0    1
##   0 2270   26
##   1   58  647
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.9720093

Accuracy: [1] 0.9720093

2)Naive Bayes

data$left<-as.factor(data$left)
#data$rank<-as.factor(data$rank)
str(data)
## 'data.frame':    14999 obs. of  11 variables:
##  $ Emp_Id               : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : int  38 80 11 72 37 41 10 92 89 42 ...
##  $ last_evaluation      : int  53 86 88 87 52 50 77 85 100 53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
# Modelling data without the Emp_Id column.
dataNB <- data[2:11]
#pairs.panels(data) 
#cor(data$gre,data$gpa)

# Reproducible random assignment of every row to partition 1 (train, drawn
# with probability 0.8) or partition 2 (test, probability 0.2).
set.seed(234)
smpl <- sample(x = 2, size = nrow(dataNB), replace = TRUE, prob = c(0.8, 0.2))
is_train <- smpl == 1
train <- dataNB[is_train, ]
test <- dataNB[smpl == 2, ]

# Naive Bayes classifier for `left` using all remaining columns.
mdl <- naive_bayes(left ~ ., data = train)
#mdl

NB Prediction

#plot(mdl)
p<-predict(mdl,train,type='prob')
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
head(cbind(p,train))
##             0         1 satisfaction_level last_evaluation number_project
## 1 0.147803194 0.8521968                 38              53              2
## 2 0.734647017 0.2653530                 80              86              5
## 3 0.001099585 0.9989004                 11              88              7
## 4 0.486081254 0.5139187                 72              87              5
## 5 0.139957478 0.8600425                 37              52              2
## 6 0.154630169 0.8453698                 41              50              2
##   average_montly_hours time_spend_company Work_accident left
## 1                  157                  3             0    1
## 2                  262                  6             0    1
## 3                  272                  4             0    1
## 4                  223                  5             0    1
## 5                  159                  3             0    1
## 6                  153                  3             0    1
##   promotion_last_5years Department salary
## 1                     0      sales    low
## 2                     0      sales medium
## 3                     0      sales medium
## 4                     0      sales    low
## 5                     0      sales    low
## 6                     0      sales    low
p1<-predict(mdl,train)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(tab1<-table(p1,train$left))
##    
## p1     0    1
##   0 7369  739
##   1 1763 2127
# `tab1` compares predictions with the labels of the rows the model was fitted
# on, so `accuracy` is a training-set figure and tends to be optimistic.
accuracy <- sum(diag(tab1)) / sum(tab1)

# Held-out estimate: score the untouched test partition as well. The label
# column is dropped from the new data so that only predictors are passed in.
p_test <- predict(mdl, test[names(test) != "left"])
tab_test <- table(p_test, test$left)
accuracy_test <- sum(diag(tab_test)) / sum(tab_test)

accuracy
## [1] 0.7914652

Accuracy (measured on the training set): [1] 0.7914652

3)SVM Model

df2 <- df
df2$left <- as.factor(df2$left)
str(df2)
## 'data.frame':    14999 obs. of  11 variables:
##  $ Emp_Id               : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : int  38 80 11 72 37 41 10 92 89 42 ...
##  $ last_evaluation      : int  53 86 88 87 52 50 77 85 100 53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
# Emp_Id is an identifier, not a predictor.
df2$Emp_Id <- NULL
set.seed(234)
# sample.split() expects the vector of class labels, not the whole data frame.
# Given a data frame it returns one flag per COLUMN, and that short mask is
# then silently recycled over the rows. Passing the label vector yields one
# flag per row and keeps the class ratio equal in both partitions.
# NOTE: the printed results that follow in this document were produced with
# the old column-length mask and will change when the analysis is re-run.
split <- sample.split(df2$left, SplitRatio = 0.7)
table(split)
##  [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
# Rows flagged TRUE in `split` form the training set, the rest the test set.
train <- subset(df2, subset = split)
test <- subset(df2, subset = !split)

# Linear-kernel support vector classifier for `left`.
classifier <- svm(
  formula = left ~ .,
  data = train,
  type = "C-classification",
  kernel = "linear"
)

# Column 7 is `left`; it is removed from the data handed to predict().
y_pred <- predict(classifier, newdata = test[-7])
y_train_pred <- predict(classifier, newdata = train[-7])

# Actual (rows) against predicted (columns) on the test set.
cm <- table(test[, 7], y_pred)
cm
##    y_pred
##        0    1
##   0 3208  220
##   1  792  280
cm2 = table(train[, 7], y_train_pred )
cm2
##    y_train_pred
##        0    1
##   0 7525  475
##   1 1864  635
sum(diag(cm))/sum(cm)
## [1] 0.7751111

Accuracy: [1] 0.7751111

4)Random Forest Model

# No new split is made here: `train` and `test` are whatever the preceding
# section left in the workspace.
# Fit a 50-tree random forest on the training rows; column 7 (`left`) is the
# response and is therefore removed from the predictors.
set.seed(120)
classifier_RF <- randomForest(
  x = train[-7],
  y = train$left,
  ntree = 50
)
classifier_RF
## 
## Call:
##  randomForest(x = train[-7], y = train$left, ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.01%
## Confusion matrix:
##      0    1 class.error
## 0 7986   14  0.00175000
## 1   92 2407  0.03681473
# Predicting the Test set results
y_pred = predict(classifier_RF, newdata = test[-7])
# Confusion Matrix
confusion_mtx = table(test[, 7], y_pred)
confusion_mtx
##    y_pred
##        0    1
##   0 3422    6
##   1   44 1028
# Plotting model
plot(classifier_RF)

# Importance plot
importance(classifier_RF)
##                       MeanDecreaseGini
## satisfaction_level         1354.816274
## last_evaluation             405.770829
## number_project              678.604828
## average_montly_hours        552.729814
## time_spend_company          702.646083
## Work_accident                18.387495
## promotion_last_5years         2.336715
## Department                   40.308236
## salary                       31.676829
# Variable importance plot
varImpPlot(classifier_RF)

sum(diag(confusion_mtx))/sum(confusion_mtx)
## [1] 0.9888889

Accuracy: [1] 0.9888889

5)Logistic Regression

# Loading package


set.seed(234)
# Splitting dataset
# sample.split() expects the vector of class labels, not the whole data frame.
# Given a data frame it returns one flag per COLUMN, and that short mask is
# then silently recycled over the rows. Passing the label vector yields one
# flag per row and keeps the class ratio equal in both partitions.
# NOTE: the printed results that follow in this document were produced with
# the old column-length mask and will change when the analysis is re-run.
split <- sample.split(data$left, SplitRatio = 0.8)
table(split)
##  [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
# Modelling columns only (Emp_Id dropped); rows flagged TRUE in `split` go to
# the training set, the rest to the test set.
features <- data[2:11]
train <- subset(features, subset = split)
test <- subset(features, subset = !split)

# Training model: logistic regression of `left` on the two score columns.
logistic_model <- glm(
  left ~ satisfaction_level + last_evaluation,
  family = "binomial",
  data = train
)
logistic_model
## 
## Call:  glm(formula = left ~ satisfaction_level + last_evaluation, family = "binomial", 
##     data = train)
## 
## Coefficients:
##        (Intercept)  satisfaction_level     last_evaluation  
##           0.603535           -0.038676            0.005413  
## 
## Degrees of Freedom: 10908 Total (i.e. Null);  10906 Residual
## Null Deviance:       11980 
## Residual Deviance: 10300     AIC: 10310
# Summary
summary(logistic_model)
## 
## Call:
## glm(formula = left ~ satisfaction_level + last_evaluation, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4646  -0.7053  -0.5003  -0.3326   2.3030  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.603535   0.111930   5.392 6.96e-08 ***
## satisfaction_level -0.038676   0.001027 -37.669  < 2e-16 ***
## last_evaluation     0.005413   0.001409   3.841 0.000123 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11977  on 10908  degrees of freedom
## Residual deviance: 10300  on 10906  degrees of freedom
## AIC: 10306
## 
## Number of Fisher Scoring iterations: 4
# Predict test data based on model
predict<- predict(logistic_model,test, type = "response")
#predict  

# Changing probabilities
predict<- ifelse(predict>0.5, 1, 0)
   
# Evaluating model accuracy
# using confusion matrix
table(test$left, predict)
##    predict
##        0    1
##   0 2910  207
##   1  729  244
missing_classerr <- mean(predict != test$left)
print(paste('Accuracy =', 1 - missing_classerr))
## [1] "Accuracy = 0.771149144254279"
# ROC-AUC Curve
# A ROC curve needs the continuous scores so that every cut-off can be
# evaluated. At this point `predict` only holds the 0/1 labels obtained at the
# 0.5 threshold, which collapses the curve to a single operating point and
# understates the AUC, so the probabilities are recomputed from the model.
# stats::predict is spelled out because a variable named `predict` exists.
# NOTE: the AUC printed below in this document was produced from the
# thresholded labels and will change when the analysis is re-run.
prob_left <- stats::predict(logistic_model, newdata = test, type = "response")
ROCPred <- prediction(prob_left, test$left)
ROCPer <- performance(ROCPred, measure = "tpr", x.measure = "fpr")

auc <- performance(ROCPred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.5921804
# Plotting curve
plot(ROCPer)

plot(ROCPer, colorize = TRUE,print.cutoffs.at = seq(0.1, by = 0.1),main = "ROC CURVE")
abline(a = 0, b = 1)
   
auc <- round(auc, 4)
legend(.6, .4, auc, title = "AUC", cex = 1)

Accuracy : 0.771149144254279

6)KNN

# Move `left` to the last column position.
f <- data[["left"]]
data <- subset(data, select = -c(left))
data[["left"]] <- f

# Keep a copy that still has the text labels, then replace Department and
# salary in `data` by the integer codes of their factor levels.
data_df <- data
data$Department <- as.numeric(factor(data$Department))
data$salary <- as.numeric(factor(data$salary))
head(data_df)
head(data_df)
##     Emp_Id satisfaction_level last_evaluation number_project
## 1 IND02438                 38              53              2
## 2 IND28133                 80              86              5
## 3 IND07164                 11              88              7
## 4 IND30478                 72              87              5
## 5 IND24003                 37              52              2
## 6 IND08609                 41              50              2
##   average_montly_hours time_spend_company Work_accident promotion_last_5years
## 1                  157                  3             0                     0
## 2                  262                  6             0                     0
## 3                  272                  4             0                     0
## 4                  223                  5             0                     0
## 5                  159                  3             0                     0
## 6                  153                  3             0                     0
##   Department salary left
## 1      sales    low    1
## 2      sales medium    1
## 3      sales medium    1
## 4      sales    low    1
## 5      sales    low    1
## 6      sales    low    1
set.seed(123)
# sample.split() expects the vector of class labels, not the whole data frame.
# Given a data frame it returns one flag per COLUMN, and that short mask is
# then silently recycled over the rows. Passing the label vector yields one
# flag per row and keeps the class ratio equal in both partitions.
# NOTE: the printed results that follow in this document were produced before
# this fix and will change when the analysis is re-run.
split <- sample.split(data$left, SplitRatio = 0.7)
train_cl <- subset(data, split == "TRUE")
test_cl <- subset(data, split == "FALSE")

# Feature Scaling
# The test rows are standardised with the TRAINING means and standard
# deviations, so both sets live on the same scale and no information from the
# test set leaks into the preprocessing.
train <- scale(train_cl[, 2:10])
test <- scale(
  test_cl[, 2:10],
  center = attr(train, "scaled:center"),
  scale = attr(train, "scaled:scale")
)


# Fitting KNN Model 
# to training dataset
# K = 3
# 3-nearest-neighbour classification of the scaled test rows, using the scaled
# training rows and their `left` labels.
classifier_knn <- knn(train, test, cl = train_cl$left, k = 3)

# Confusion Matrix: actual (rows) against predicted (columns).
cm <- table(test_cl$left, classifier_knn)
cm
##    classifier_knn
##        0    1
##   0 3973  182
##   1   92 1207
# Model Evaluation - Choosing K
# Calculate out of Sample error
misClassError <- mean(classifier_knn != test_cl$left)
print(paste('Accuracy =', 1-misClassError))
## [1] "Accuracy = 0.94976164283095"

Accuracy: 0.949761

KNN prediction

#head(data)
satisfaction_level<-57
last_evaluation<-34
number_project<-3
average_montly_hours<-130
time_spend_company<-3
Work_accident<-0
promotion_last_5years<-1

s1<-levels(factor(data_df$Department))
s1
##  [1] "accounting"  "hr"          "IT"          "management"  "marketing"  
##  [6] "product_mng" "RandD"       "sales"       "support"     "technical"
s2<-as.numeric(levels(factor(data$Department)))
s2
##  [1]  1  2  3  4  5  6  7  8  9 10
Department<-"IT"
p<-as.numeric(match(Department,s1))
Department=s2[p]
Department
## [1] 3
a1<-levels(factor(data_df$salary))
a1
## [1] "high"   "low"    "medium"
a2<-as.numeric(levels(factor(data$salary)))
a2
## [1] 1 2 3
salary<-"medium"
p1<-as.numeric(match(salary,a1))
salary=a2[p1]
salary
## [1] 3
l1<-levels(factor(data_df$left))
l1
## [1] "0" "1"
l2<-as.numeric(levels(factor(data$left)))
l2
## [1] 0 1
# Feature vector of the new employee, in the same column order as
# train_cl[, 2:10].
x <- c(
  satisfaction_level, last_evaluation, number_project, average_montly_hours,
  time_spend_company, Work_accident, promotion_last_5years, Department, salary
)

# The evaluated KNN model works on standardised features, so the new
# observation is standardised with the same training means and standard
# deviations and classified against the scaled training matrix `train`.
# Using the raw columns instead would let the large-valued features dominate
# the distance. The earlier refit of `classifier_knn` was an exact repeat of
# the call above and is not needed for this prediction.
# NOTE: the "Left :" line printed below in this document was produced with the
# unscaled features and may change when the analysis is re-run.
x_scaled <- (x - attr(train, "scaled:center")) / attr(train, "scaled:scale")
z <- knn(
  train = train,
  test = matrix(x_scaled, nrow = 1),
  cl = train_cl$left,
  k = 3
)

# Map the predicted factor level back to its numeric code.
z1 <- as.numeric(match(z, l2))
cat("Left :", l2[z1])
## Left : 0